{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:18.057071Z",
     "start_time": "2020-03-21T07:37:17.413598Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import warnings\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "from sklearn import preprocessing, metrics\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold\n",
    "import gc\n",
    "import lightgbm as lgb\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from joblib import Parallel, delayed\n",
    "from sklearn.utils import shuffle\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', None)\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:18.060952Z",
     "start_time": "2020-03-21T07:37:18.058837Z"
    }
   },
   "outputs": [],
   "source": [
    "# Global random seed; passed to the LightGBM model as random_state.\n",
    "seed = 2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:18.075927Z",
     "start_time": "2020-03-21T07:37:18.062106Z"
    }
   },
   "outputs": [],
   "source": [
    "# Load the pre-built feature table (train and test rows together, told apart by 'type').\n",
    "feature_path = './temp/part2_feature.plk'\n",
    "df_feature = pd.read_pickle(feature_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:18.145736Z",
     "start_time": "2020-03-21T07:37:18.077160Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>courier_id</th>\n",
       "      <th>wave_index</th>\n",
       "      <th>tracking_id</th>\n",
       "      <th>courier_wave_start_lng</th>\n",
       "      <th>courier_wave_start_lat</th>\n",
       "      <th>action_type</th>\n",
       "      <th>expect_time</th>\n",
       "      <th>date</th>\n",
       "      <th>type</th>\n",
       "      <th>target</th>\n",
       "      <th>group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>98263</td>\n",
       "      <td>10330725</td>\n",
       "      <td>9</td>\n",
       "      <td>2100075923187730175</td>\n",
       "      <td>121.481429</td>\n",
       "      <td>39.299365</td>\n",
       "      <td>PICKUP</td>\n",
       "      <td>1582888651</td>\n",
       "      <td>20200228</td>\n",
       "      <td>train</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20200228103307259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>116276</td>\n",
       "      <td>10053442</td>\n",
       "      <td>1</td>\n",
       "      <td>2100075314534647435</td>\n",
       "      <td>121.479587</td>\n",
       "      <td>39.248115</td>\n",
       "      <td>PICKUP</td>\n",
       "      <td>1582084880</td>\n",
       "      <td>20200219</td>\n",
       "      <td>train</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20200219100534421</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>153284</td>\n",
       "      <td>118787313</td>\n",
       "      <td>4</td>\n",
       "      <td>2100075078536791439</td>\n",
       "      <td>121.440498</td>\n",
       "      <td>39.203471</td>\n",
       "      <td>DELIVERY</td>\n",
       "      <td>1581671584</td>\n",
       "      <td>20200214</td>\n",
       "      <td>train</td>\n",
       "      <td>1.0</td>\n",
       "      <td>202002141187873134</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>16134</td>\n",
       "      <td>116706233</td>\n",
       "      <td>3</td>\n",
       "      <td>2100074825841346124</td>\n",
       "      <td>121.543010</td>\n",
       "      <td>39.258822</td>\n",
       "      <td>DELIVERY</td>\n",
       "      <td>1581075896</td>\n",
       "      <td>20200207</td>\n",
       "      <td>train</td>\n",
       "      <td>1.0</td>\n",
       "      <td>202002071167062333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>23009</td>\n",
       "      <td>118333873</td>\n",
       "      <td>5</td>\n",
       "      <td>2100074746653279446</td>\n",
       "      <td>121.406669</td>\n",
       "      <td>39.364738</td>\n",
       "      <td>PICKUP</td>\n",
       "      <td>1580906387</td>\n",
       "      <td>20200205</td>\n",
       "      <td>train</td>\n",
       "      <td>1.0</td>\n",
       "      <td>202002051183338735</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id  courier_id  wave_index          tracking_id  \\\n",
       "0   98263    10330725           9  2100075923187730175   \n",
       "1  116276    10053442           1  2100075314534647435   \n",
       "2  153284   118787313           4  2100075078536791439   \n",
       "3   16134   116706233           3  2100074825841346124   \n",
       "4   23009   118333873           5  2100074746653279446   \n",
       "\n",
       "   courier_wave_start_lng  courier_wave_start_lat action_type  expect_time  \\\n",
       "0              121.481429               39.299365      PICKUP   1582888651   \n",
       "1              121.479587               39.248115      PICKUP   1582084880   \n",
       "2              121.440498               39.203471    DELIVERY   1581671584   \n",
       "3              121.543010               39.258822    DELIVERY   1581075896   \n",
       "4              121.406669               39.364738      PICKUP   1580906387   \n",
       "\n",
       "       date   type  target               group  \n",
       "0  20200228  train     1.0   20200228103307259  \n",
       "1  20200219  train     1.0   20200219100534421  \n",
       "2  20200214  train     1.0  202002141187873134  \n",
       "3  20200207  train     1.0  202002071167062333  \n",
       "4  20200205  train     1.0  202002051183338735  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the loaded feature table.\n",
    "df_feature.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:18.164458Z",
     "start_time": "2020-03-21T07:37:18.146889Z"
    }
   },
   "outputs": [],
   "source": [
    "# Split the combined table back into its partitions using the 'type' marker column.\n",
    "# Each partition is an independent copy, so later column assignments do not touch df_feature.\n",
    "is_test_row = df_feature['type'] == 'test'\n",
    "is_train_row = df_feature['type'] == 'train'\n",
    "df_test = df_feature[is_test_row].copy()\n",
    "df_train = df_feature[is_train_row].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:18.169553Z",
     "start_time": "2020-03-21T07:37:18.166027Z"
    }
   },
   "outputs": [],
   "source": [
    "# Submission frame: the test rows' identifying columns plus an 'expect_time' accumulator.\n",
    "# .copy() makes this an independent DataFrame; assigning into a bare column selection of\n",
    "# df_test is the SettingWithCopy pattern (the warning is silenced globally in this notebook).\n",
    "submission_cols = ['courier_id', 'wave_index', 'tracking_id',\n",
    "                   'courier_wave_start_lng', 'courier_wave_start_lat', 'action_type', 'expect_time', 'date']\n",
    "prediction = df_test[submission_cols].copy()\n",
    "# Start from 0.0 (float): the per-fold model predictions are added into this column later.\n",
    "prediction['expect_time'] = 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:18.369514Z",
     "start_time": "2020-03-21T07:37:18.170937Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "action_type\n",
      "group\n"
     ]
    }
   ],
   "source": [
    "# Integer-encode every object-dtype column except the bookkeeping ones.\n",
    "# Each encoder is fitted on the train and test values together so both frames share one mapping.\n",
    "skip_cols = ['date', 'type']\n",
    "for col in df_feature.select_dtypes('object').columns:\n",
    "    if col in skip_cols:\n",
    "        continue\n",
    "    print(col)\n",
    "    train_as_str = df_train[col].astype(str)\n",
    "    test_as_str = df_test[col].astype(str)\n",
    "    encoder = LabelEncoder().fit(train_as_str.tolist() + test_as_str.tolist())\n",
    "    df_train[col] = encoder.transform(train_as_str)\n",
    "    df_test[col] = encoder.transform(test_as_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:20.190499Z",
     "start_time": "2020-03-21T07:37:18.370845Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Fold_1 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[59]\ttrain's l1: 609530\tvalid's l1: 614571\n",
      "\n",
      "Fold_2 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[88]\ttrain's l1: 609419\tvalid's l1: 613779\n",
      "\n",
      "Fold_3 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[99]\ttrain's l1: 608459\tvalid's l1: 613229\n",
      "\n",
      "Fold_4 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[203]\ttrain's l1: 607344\tvalid's l1: 611899\n",
      "\n",
      "Fold_5 Training ================================\n",
      "\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "Early stopping, best iteration is:\n",
      "[203]\ttrain's l1: 608302\tvalid's l1: 611439\n"
     ]
    }
   ],
   "source": [
    "ycol = 'expect_time'\n",
    "# Target, identifiers and bookkeeping columns that must not be fed to the model.\n",
    "non_feature_cols = [ycol, 'id', 'wave_index', 'tracking_id', 'target', 'date', 'type', 'group',\n",
    "                    'courier_wave_start_lng', 'courier_wave_start_lat']\n",
    "feature_names = [c for c in df_train.columns if c not in non_feature_cols]\n",
    "\n",
    "# n_estimators is effectively unbounded; the real tree count is chosen by early stopping in fit().\n",
    "# NOTE(review): per the LightGBM parameter docs, subsample (bagging_fraction) only takes effect\n",
    "# when subsample_freq (bagging_freq) is non-zero. It is left unset here to keep results unchanged;\n",
    "# confirm whether row subsampling was intended.\n",
    "model = lgb.LGBMRegressor(num_leaves=64,\n",
    "                          max_depth=10,\n",
    "                          learning_rate=0.1,\n",
    "                          n_estimators=10000000,\n",
    "                          subsample=0.8,\n",
    "                          feature_fraction=0.8,\n",
    "                          reg_alpha=0.5,\n",
    "                          reg_lambda=0.5,\n",
    "                          random_state=seed,\n",
    "                          metric=None\n",
    "                          )\n",
    "\n",
    "\n",
    "oof = []  # one out-of-fold prediction frame per fold\n",
    "df_importance_list = []  # one feature-importance frame per fold\n",
    "\n",
    "# Group-aware CV: rows sharing a 'group' value never straddle the train/validation split.\n",
    "kfold = GroupKFold(n_splits=5)\n",
    "fold_splits = kfold.split(df_train[feature_names], df_train[ycol], df_train['group'])\n",
    "for fold_id, (trn_idx, val_idx) in enumerate(fold_splits):\n",
    "    # Slice the fold's rows once, then pick features / target from the slices.\n",
    "    df_trn_fold = df_train.iloc[trn_idx]\n",
    "    df_val_fold = df_train.iloc[val_idx]\n",
    "\n",
    "    X_train = df_trn_fold[feature_names]\n",
    "    Y_train = df_trn_fold[ycol]\n",
    "\n",
    "    X_val = df_val_fold[feature_names]\n",
    "    Y_val = df_val_fold[ycol]\n",
    "\n",
    "    print('\\nFold_{} Training ================================\\n'.format(fold_id+1))\n",
    "\n",
    "    # Early stopping follows the last eval set ('valid'), scored with MAE.\n",
    "    lgb_model = model.fit(X_train,\n",
    "                          Y_train,\n",
    "                          eval_names=['train', 'valid'],\n",
    "                          eval_set=[(X_train, Y_train), (X_val, Y_val)],\n",
    "                          verbose=500,\n",
    "                          eval_metric='mae',\n",
    "                          early_stopping_rounds=50)\n",
    "\n",
    "    # Out-of-fold predictions at the best iteration, kept with 'id' for later scoring.\n",
    "    pred_val = lgb_model.predict(\n",
    "        X_val, num_iteration=lgb_model.best_iteration_)\n",
    "    df_oof = df_val_fold[['id', ycol]].copy()\n",
    "    df_oof['pred'] = pred_val\n",
    "    oof.append(df_oof)\n",
    "\n",
    "    # Average the test predictions over the folds. Dividing by kfold.n_splits (rather than a\n",
    "    # literal 5) keeps the mean correct if the number of folds is ever changed.\n",
    "    pred_test = lgb_model.predict(\n",
    "        df_test[feature_names], num_iteration=lgb_model.best_iteration_)\n",
    "    prediction['expect_time'] += pred_test / kfold.n_splits\n",
    "\n",
    "    df_importance = pd.DataFrame({\n",
    "        'column': feature_names,\n",
    "        'importance': lgb_model.feature_importances_,\n",
    "    })\n",
    "    df_importance_list.append(df_importance)\n",
    "\n",
    "    # Drop per-fold references before the next iteration to keep kernel memory down.\n",
    "    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val\n",
    "    del df_trn_fold, df_val_fold\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:20.201040Z",
     "start_time": "2020-03-21T07:37:20.191762Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>courier_id</td>\n",
       "      <td>5506.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>action_type</td>\n",
       "      <td>1916.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        column  importance\n",
       "0   courier_id      5506.6\n",
       "1  action_type      1916.4"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean importance of each feature across all folds, most important first.\n",
    "df_importance = (\n",
    "    pd.concat(df_importance_list)\n",
    "    .groupby(['column'])['importance']\n",
    "    .mean()\n",
    "    .sort_values(ascending=False)\n",
    "    .reset_index()\n",
    ")\n",
    "df_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:20.207336Z",
     "start_time": "2020-03-21T07:37:20.202045Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mae: 612983.2239464395\n"
     ]
    }
   ],
   "source": [
    "# Stack the per-fold out-of-fold predictions and score them once with MAE.\n",
    "df_oof = pd.concat(oof)\n",
    "mae = metrics.mean_absolute_error(y_true=df_oof[ycol], y_pred=df_oof['pred'])\n",
    "print('mae:', mae)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:20.268041Z",
     "start_time": "2020-03-21T07:37:20.208458Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>courier_id</th>\n",
       "      <th>wave_index</th>\n",
       "      <th>tracking_id</th>\n",
       "      <th>courier_wave_start_lng</th>\n",
       "      <th>courier_wave_start_lat</th>\n",
       "      <th>action_type</th>\n",
       "      <th>expect_time</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>44058</th>\n",
       "      <td>10012508</td>\n",
       "      <td>1</td>\n",
       "      <td>2100076472664917076</td>\n",
       "      <td>121.549495</td>\n",
       "      <td>39.289891</td>\n",
       "      <td>DELIVERY</td>\n",
       "      <td>1.581864e+09</td>\n",
       "      <td>20200306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44059</th>\n",
       "      <td>10012508</td>\n",
       "      <td>5</td>\n",
       "      <td>2100076518544770291</td>\n",
       "      <td>121.529386</td>\n",
       "      <td>39.297263</td>\n",
       "      <td>DELIVERY</td>\n",
       "      <td>1.581864e+09</td>\n",
       "      <td>20200306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44060</th>\n",
       "      <td>10021791</td>\n",
       "      <td>1</td>\n",
       "      <td>2100076474713671215</td>\n",
       "      <td>121.451866</td>\n",
       "      <td>39.219517</td>\n",
       "      <td>PICKUP</td>\n",
       "      <td>1.581948e+09</td>\n",
       "      <td>20200306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44061</th>\n",
       "      <td>10037225</td>\n",
       "      <td>1</td>\n",
       "      <td>2100076471997432842</td>\n",
       "      <td>121.485061</td>\n",
       "      <td>39.150327</td>\n",
       "      <td>PICKUP</td>\n",
       "      <td>1.581948e+09</td>\n",
       "      <td>20200306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44062</th>\n",
       "      <td>10037225</td>\n",
       "      <td>3</td>\n",
       "      <td>2100076506867894139</td>\n",
       "      <td>121.525111</td>\n",
       "      <td>39.151175</td>\n",
       "      <td>DELIVERY</td>\n",
       "      <td>1.581911e+09</td>\n",
       "      <td>20200306</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       courier_id  wave_index          tracking_id  courier_wave_start_lng  \\\n",
       "44058    10012508           1  2100076472664917076              121.549495   \n",
       "44059    10012508           5  2100076518544770291              121.529386   \n",
       "44060    10021791           1  2100076474713671215              121.451866   \n",
       "44061    10037225           1  2100076471997432842              121.485061   \n",
       "44062    10037225           3  2100076506867894139              121.525111   \n",
       "\n",
       "       courier_wave_start_lat action_type   expect_time      date  \n",
       "44058               39.289891    DELIVERY  1.581864e+09  20200306  \n",
       "44059               39.297263    DELIVERY  1.581864e+09  20200306  \n",
       "44060               39.219517      PICKUP  1.581948e+09  20200306  \n",
       "44061               39.150327      PICKUP  1.581948e+09  20200306  \n",
       "44062               39.151175    DELIVERY  1.581911e+09  20200306  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the accumulated test predictions.\n",
    "prediction.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:20.323367Z",
     "start_time": "2020-03-21T07:37:20.269183Z"
    }
   },
   "outputs": [],
   "source": [
    "import zipfile\n",
    "\n",
    "# The submission directory and archive are both named after the integer part of the OOF MAE.\n",
    "sub_tag = int(mae)\n",
    "sub_dir = './sub/{}'.format(sub_tag)\n",
    "os.makedirs(sub_dir, exist_ok=True)\n",
    "\n",
    "# The context manager finalises and closes the archive even if a write raises part-way through.\n",
    "with zipfile.ZipFile('./sub/{}.zip'.format(sub_tag), 'w', zipfile.ZIP_DEFLATED) as zf:\n",
    "    # One text file per date, written to disk and then added to the archive under its bare name.\n",
    "    for date in prediction['date'].unique():\n",
    "        # drop() returns a new frame, so 'date' is removed without mutating a slice of prediction.\n",
    "        df_temp = prediction[prediction['date'] == date].drop(columns='date')\n",
    "        txt_name = 'action_{}.txt'.format(date)\n",
    "        txt_path = '{}/{}'.format(sub_dir, txt_name)\n",
    "        df_temp.to_csv(txt_path, index=False)\n",
    "        zf.write(txt_path, txt_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-03-21T07:37:20.386808Z",
     "start_time": "2020-03-21T07:37:20.324551Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n20200306 639\\n20200303 804\\n20200302 582\\n20200304 552\\n20200305 793\\n20200301 849\\n'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "20200306 639\n",
    "20200303 804\n",
    "20200302 582\n",
    "20200304 552\n",
    "20200305 793\n",
    "20200301 849\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:dm] *",
   "language": "python",
   "name": "conda-env-dm-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
